Fijamos la semilla para poder reproducir los experimentos. También se fija el numero de CPU’s a utilizar.
set.seed(42)
options(mc.cores = 24)Se importan las librerÃas a utilizar a lo largo de la notebook:
# install.packages(pacman)
# install.packages("https://cran.r-project.org/src/contrib/rstan_2.21.2.tar.gz",repos = NULL,type="source")
# sudo apt-get install libglpk-devlibrary(pacman)
p_load(tidyverse, tidymodels, rsample, rstan, shinystan, rstanarm, devtools)
source('../src/dataset.R')
source('../src/plot.R')
source('../src/model.R')Palmer Penguins
dataset <- load_dataset() %>% mutate_if(is.character, as.factor)
dataset %>% glimpse()## Rows: 344
## Columns: 8
## $ species <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
## $ island <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
## $ bill_length_mm <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
## $ bill_depth_mm <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
## $ flipper_length_mm <dbl> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
## $ body_mass_g <dbl> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
## $ sex <fct> male, female, female, NA, female, male, female, male…
## $ year <dbl> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…
Se enumeran y describen breve-mente cada variable que forma parte del dataset:
Variables Numéricas:
Variables Categóricas:
missings_summary(dataset)hist_plots(dataset)Observaciones
box_plots(dataset)Observaciones
No se registran valores mas extremos que el mÃnimo y máximo valor en cada variables. Es decir que no encontramos outliers.
outliers(dataset, column='bill_length_mm')## $inf
## [1] 32.1
##
## $sup
## [1] 59.6
outliers(dataset, column='bill_length_mm')## $inf
## [1] 32.1
##
## $sup
## [1] 59.6
outliers(dataset, column='bill_depth_mm')## $inf
## [1] 13.1
##
## $sup
## [1] 21.5
outliers(dataset, column='flipper_length_mm')## $inf
## [1] 172
##
## $sup
## [1] 231
outliers(dataset, column='body_mass_g')## $inf
## [1] 2700
##
## $sup
## [1] 6300
outliers(dataset, column='year')## $inf
## [1] 2007
##
## $sup
## [1] 2009
bar_plots(dataset)Observaciones
dataset <- dataset %>% drop_na()
missings_summary(dataset)## Warning: attributes are not identical across measure variables;
## they will be dropped
corr_plot(dataset %>% dplyr::select(-year))segmented_pairs_plot(dataset, segment_column='species')train_test <- train_test_split(dataset, train_size = 0.7, shuffle = TRUE)## [1] "Train set size: 233"
## [1] "Test set size: 100"
train_set <- train_test[[1]]
test_set <- train_test[[2]]lineal_model_1 <- lm(
body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm,
data = train_set
)
summary(lineal_model_1)##
## Call:
## lm(formula = body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm,
## data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -868.52 -260.19 -17.83 239.28 1269.72
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6074.063 654.637 -9.279 <2e-16 ***
## bill_length_mm 3.530 6.297 0.561 0.576
## bill_depth_mm 12.534 16.232 0.772 0.441
## flipper_length_mm 49.318 2.874 17.157 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 382.4 on 229 degrees of freedom
## Multiple R-squared: 0.7755, Adjusted R-squared: 0.7725
## F-statistic: 263.6 on 3 and 229 DF, p-value: < 2.2e-16
Quitamos los coeficientes que no son significativos para explicar a la variable body_mass_g:
lineal_model_1 <- lm(body_mass_g ~ flipper_length_mm, data = train_set)
summary(lineal_model_1)##
## Call:
## lm(formula = body_mass_g ~ flipper_length_mm, data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -857.93 -257.17 -17.05 232.95 1278.39
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5688.500 353.398 -16.10 <2e-16 ***
## flipper_length_mm 49.240 1.749 28.16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 381.7 on 231 degrees of freedom
## Multiple R-squared: 0.7743, Adjusted R-squared: 0.7734
## F-statistic: 792.7 on 1 and 231 DF, p-value: < 2.2e-16
bayesion_model_1 <- stan(
model_code = "
data {
int<lower=1> obs_count;
vector<lower=1>[obs_count] x;
vector[obs_count] y;
}
parameters {
real beta0;
real beta1;
real<lower=0> sigma;
}
model {
// Distribuciones a priori (Fijado por el investigador)
beta0 ~ normal(0, 8000); // Intercept
beta1 ~ normal(0, 100); // flipper_length_mm
sigma ~ exponential(0.1); // Varianza
// Likelihood
y ~ normal(beta0 + beta1 * x, sigma);
}
",
data = list(
obs_count = nrow(train_set),
x = colvalues(train_set, 'flipper_length_mm'),
y = colvalues(train_set, 'body_mass_g')
),
chains = 3,
iter = 300,
warmup = 180,
thin = 1
)## Running /usr/local/lib/R/bin/R CMD SHLIB foo.c
## gcc -I"/usr/local/lib/R/include" -DNDEBUG -I"/usr/local/lib/R/site-library/Rcpp/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/unsupported" -I"/usr/local/lib/R/site-library/BH/include" -I"/usr/local/lib/R/site-library/StanHeaders/include/src/" -I"/usr/local/lib/R/site-library/StanHeaders/include/" -I"/usr/local/lib/R/site-library/RcppParallel/include/" -I"/usr/local/lib/R/site-library/rstan/include" -DEIGEN_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DBOOST_PENDING_INTEGER_LOG2_HPP -DSTAN_THREADS -DBOOST_NO_AUTO_PTR -include '/usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp' -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1 -I/usr/local/include -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c foo.c -o foo.o
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:88,
## from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:1: error: unknown type name ‘namespace’
## 628 | namespace Eigen {
## | ^~~~~~~~~
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:17: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’ before ‘{’ token
## 628 | namespace Eigen {
## | ^
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:96:10: fatal error: complex: No such file or directory
## 96 | #include <complex>
## | ^~~~~~~~~
## compilation terminated.
## make: *** [/usr/local/lib/R/etc/Makeconf:168: foo.o] Error 1
params_1 <- c('beta0', 'beta1', 'sigma')
traceplot(bayesion_model_1, pars = params_1, inc_warmup = TRUE)lm_vs_br_coeficients(lineal_model_1, bayesion_model_1, params_1)vars_1 <- c('flipper_length_mm')
lm_vs_br_models_validation(lineal_model_1, bayesion_model_1, params_1, vars_1, test_set)bayesion_predictor_1 <- BayesianRegressionPredictor.from(bayesion_model_1, params_1, vars_1)
plot_compare_fit(
lineal_model_1,
bayesion_predictor_1,
train_set,
label_1 = 'Regresion Lineal',
label_2 = 'Regresion Bayesiana'
)lineal_model_2 <- lm(
body_mass_g
~ bill_length_mm
+ bill_depth_mm
+ flipper_length_mm
+ sex,
data = train_set
)
summary(lineal_model_2)##
## Call:
## lm(formula = body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm +
## sex, data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -755.02 -255.89 -6.03 221.97 989.38
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1922.6152 729.5654 -2.635 0.00898 **
## bill_length_mm -0.5331 5.4439 -0.098 0.92208
## bill_depth_mm -92.9217 18.2692 -5.086 7.62e-07 ***
## flipper_length_mm 37.2016 2.8209 13.188 < 2e-16 ***
## sexmale 534.1583 59.5409 8.971 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 329.5 on 228 degrees of freedom
## Multiple R-squared: 0.834, Adjusted R-squared: 0.8311
## F-statistic: 286.5 on 4 and 228 DF, p-value: < 2.2e-16
Quitamos los coeficientes que no son significativos para explicar a la variable body_mass_g:
lineal_model_2 <- lm(
body_mass_g
~ bill_depth_mm
+ flipper_length_mm
+ sex,
data = train_set
)
summary(lineal_model_2)##
## Call:
## lm(formula = body_mass_g ~ bill_depth_mm + flipper_length_mm +
## sex, data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -758.30 -258.55 -5.04 223.02 989.59
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1912.697 720.936 -2.653 0.00853 **
## bill_depth_mm -93.106 18.133 -5.135 6.04e-07 ***
## flipper_length_mm 37.053 2.373 15.617 < 2e-16 ***
## sexmale 533.673 59.206 9.014 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 328.7 on 229 degrees of freedom
## Multiple R-squared: 0.834, Adjusted R-squared: 0.8319
## F-statistic: 383.6 on 3 and 229 DF, p-value: < 2.2e-16
Construimos una matriz con todas las variables(X) mas el intercept:
to_model_input <- function(df) {
model.matrix(
body_mass_g
~ bill_depth_mm
+ flipper_length_mm
+ sex,
data = df
)
}
train_X <- train_set %>% to_model_input()
test_X <- test_set %>% to_model_input()
head(train_X)## (Intercept) bill_depth_mm flipper_length_mm sexmale
## 1 1 18.8 197 1
## 2 1 14.6 211 0
## 3 1 19.1 195 1
## 4 1 15.9 224 1
## 5 1 18.5 201 1
## 6 1 18.3 195 1
Definimos y corremos el modelo bayesiano:
bayesion_model_2 <- stan(
model_code = "
data {
int<lower=1> obs_count;
int<lower=1> coef_count;
matrix[obs_count,coef_count] X;
vector[obs_count] y;
}
parameters {
vector[coef_count] beta;
real<lower=0> sigma;
}
model {
// Distribuciones a priori
beta[1] ~ normal(0, 3000); // Intecept = beta0 + sexfemale
beta[2] ~ normal(0, 200); // bill_depth_mm
beta[3] ~ normal(0, 200); // flipper_length_mm
beta[4] ~ normal(0, 600); // sexmale
sigma ~ exponential(0.1); // Varianza
// Likelihood
y ~ normal(X * beta, sigma);
}
",
data = list(
obs_count = dim(train_X)[1],
coef_count = dim(train_X)[2],
y = colvalues(train_set, 'body_mass_g'),
X = train_X
),
chains = 3,
iter = 4000,
warmup = 500,
thin = 1
)## Running /usr/local/lib/R/bin/R CMD SHLIB foo.c
## gcc -I"/usr/local/lib/R/include" -DNDEBUG -I"/usr/local/lib/R/site-library/Rcpp/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/unsupported" -I"/usr/local/lib/R/site-library/BH/include" -I"/usr/local/lib/R/site-library/StanHeaders/include/src/" -I"/usr/local/lib/R/site-library/StanHeaders/include/" -I"/usr/local/lib/R/site-library/RcppParallel/include/" -I"/usr/local/lib/R/site-library/rstan/include" -DEIGEN_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DBOOST_PENDING_INTEGER_LOG2_HPP -DSTAN_THREADS -DBOOST_NO_AUTO_PTR -include '/usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp' -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1 -I/usr/local/include -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c foo.c -o foo.o
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:88,
## from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:1: error: unknown type name ‘namespace’
## 628 | namespace Eigen {
## | ^~~~~~~~~
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:17: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’ before ‘{’ token
## 628 | namespace Eigen {
## | ^
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:96:10: fatal error: complex: No such file or directory
## 96 | #include <complex>
## | ^~~~~~~~~
## compilation terminated.
## make: *** [/usr/local/lib/R/etc/Makeconf:168: foo.o] Error 1
params_2 <- c('beta[1]', 'beta[2]', 'beta[3]', 'beta[4]', 'sigma')
traceplot(bayesion_model_2, inc_warmup = TRUE, pars = params_2)lineal_model_2$coefficients## (Intercept) bill_depth_mm flipper_length_mm sexmale
## -1912.69656 -93.10594 37.05300 533.67323
br_coeficients(bayesion_model_2, params_2)vars_2 <- c('bill_depth_mm', 'flipper_length_mm', 'sexmale')
lm_vs_br_models_validation(
lineal_model_2,
bayesion_model_2,
params_2,
vars_2,
test_set,
as.data.frame(test_X)
)bayesion_predictor_2 <- BayesianRegressionPredictor.from(bayesion_model_2, params_2, vars_2)
plot_compare_fit(
lineal_model_2,
bayesion_predictor_2,
test_set,
as.data.frame(test_X),
label_1 = 'Regresion Lineal',
label_2 = 'Regresion Bayesiana'
)A continuación vamos a agregar outliers a la variable flipper_length_mm, la cual define la longitud de la aleta del individuo medida en milÃmetros.
Para visualizar los nuevos outliers a continuación graficamos flipper_length_mm vs. body_mass_g antes y después de agregar outliers:
plot_data(train_set)train_set_with_outliers <- train_set %>%
mutate(flipper_length_mm = ifelse(
body_mass_g > 5400 & body_mass_g < 5700,
flipper_length_mm + (flipper_length_mm * runif(1, 0.1, 0.2)),
flipper_length_mm
))
plot_data(train_set_with_outliers)lineal_model_3 <- lm(
body_mass_g ~ flipper_length_mm,
data = train_set_with_outliers
)
summary(lineal_model_3)##
## Call:
## lm(formula = body_mass_g ~ flipper_length_mm, data = train_set_with_outliers)
##
## Residuals:
## Min 1Q Median 3Q Max
## -900.95 -349.47 -38.43 314.16 1234.99
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2007.392 290.275 -6.915 4.52e-11 ***
## flipper_length_mm 30.518 1.411 21.628 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 462 on 231 degrees of freedom
## Multiple R-squared: 0.6694, Adjusted R-squared: 0.668
## F-statistic: 467.8 on 1 and 231 DF, p-value: < 2.2e-16
Comparemos el ajuste del modelo lineal ajustando en un dataset de train con y sin outliers:
plot_compare_fit(
lineal_model_1,
lineal_model_3,
train_set_with_outliers,
label_1='Regresión Lineal SIN outliers',
label_2='Regresión Lineal CON outliters'
)Se aprecia que el modelo entrenado en train_set_outliers esta apalancado por las observaciones atÃpicas
Entrenamos una regresión lineal múltiple robusta para intentar de disminuir el efecto de los nuevos outliers:
robust_lineal_model_3 <- rlm(
body_mass_g ~ flipper_length_mm,
data = train_set_with_outliers
)plot_compare_fit(
lineal_model_1,
robust_lineal_model_3,
train_set_with_outliers,
label_1='Regresión Lineal SIN outliers',
label_2='Regresión Lineal Robusta CON outliters'
)Gráficamente no se llega a distinguir pero el modelo robusto termina ajustando mejor que el modelo lineal clásico. Esto se puede observar cuando comparamos el RMSE/MAE sobre train y test.
lm_vs_lm_models_validation(lineal_model_3, robust_lineal_model_3, train_set_with_outliers)## Warning in actual - predicted: longer object length is not a multiple of shorter
## object length
## Warning in actual - predicted: longer object length is not a multiple of shorter
## object length
## Warning in actual - predicted: longer object length is not a multiple of shorter
## object length
## Warning in actual - predicted: longer object length is not a multiple of shorter
## object length
lm_vs_lm_models_validation(lineal_model_3, robust_lineal_model_3, test_set)bayesion_model_3 <- stan(
model_code = "
data {
int<lower=1> obs_count;
vector<lower=1>[obs_count] x;
vector[obs_count] y;
}
parameters {
real beta0;
real beta1;
real<lower=0> sigma;
}
model {
// Distribuciones a priori (Fijado por el investigador)
beta0 ~ normal(0, 1000); // Intercept
beta1 ~ normal(0, 100); // flipper_length_mm
sigma ~ exponential(0.1); // Varianza
// Likelihood
y ~ normal(beta0 + beta1 * x, sigma);
}
",
data = list(
obs_count = nrow(train_set_with_outliers),
x = colvalues(train_set_with_outliers, 'flipper_length_mm'),
y = colvalues(train_set_with_outliers, 'body_mass_g')
),
chains = 3,
iter = 1000,
warmup = 300,
thin = 1
)## Running /usr/local/lib/R/bin/R CMD SHLIB foo.c
## gcc -I"/usr/local/lib/R/include" -DNDEBUG -I"/usr/local/lib/R/site-library/Rcpp/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/unsupported" -I"/usr/local/lib/R/site-library/BH/include" -I"/usr/local/lib/R/site-library/StanHeaders/include/src/" -I"/usr/local/lib/R/site-library/StanHeaders/include/" -I"/usr/local/lib/R/site-library/RcppParallel/include/" -I"/usr/local/lib/R/site-library/rstan/include" -DEIGEN_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DBOOST_PENDING_INTEGER_LOG2_HPP -DSTAN_THREADS -DBOOST_NO_AUTO_PTR -include '/usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp' -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1 -I/usr/local/include -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c foo.c -o foo.o
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:88,
## from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:1: error: unknown type name ‘namespace’
## 628 | namespace Eigen {
## | ^~~~~~~~~
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:17: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’ before ‘{’ token
## 628 | namespace Eigen {
## | ^
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:96:10: fatal error: complex: No such file or directory
## 96 | #include <complex>
## | ^~~~~~~~~
## compilation terminated.
## make: *** [/usr/local/lib/R/etc/Makeconf:168: foo.o] Error 1
params_3 <- c('beta0', 'beta1', 'sigma')
traceplot(bayesion_model_3, pars = params_3, inc_warmup = TRUE)lm_vs_br_coeficients(robust_lineal_model_3, bayesion_model_3, params_3)vars_3 <- c('flipper_length_mm')
lm_vs_br_models_validation(robust_lineal_model_3, bayesion_model_3, params_3, vars_3, test_set)bayesion_predictor_3 <- BayesianRegressionPredictor.from(bayesion_model_3, params_3, vars_3)
plot_compare_fit(
robust_lineal_model_3,
bayesion_predictor_3,
train_set,
label_1='Regresion Lineal Robusta CON outliers',
label_2='Regresion Bayesiana CON outliers'
)bayesion_model_3.1 <- stan(
model_code = "
data {
int<lower=1> obs_count;
vector<lower=1>[obs_count] x;
vector[obs_count] y;
}
parameters {
real beta0;
real beta1;
real<lower=0> sigma;
}
model {
// Distribuciones a priori (Fijado por el investigador)
beta0 ~ normal(-5700, 50); // Intercept (Muy informativa!!!)
beta1 ~ normal(49, 20); // flipper_length_mm (Muy informativa!!!)
sigma ~ exponential(0.1); // Varianza
// Likelihood
y ~ student_t(2, beta0 + beta1 * x, sigma);
}
",
data = list(
obs_count = nrow(train_set_with_outliers),
x = colvalues(train_set_with_outliers, 'flipper_length_mm'),
y = colvalues(train_set_with_outliers, 'body_mass_g')
),
chains = 3,
iter = 1000,
warmup = 300,
thin = 1
)## Running /usr/local/lib/R/bin/R CMD SHLIB foo.c
## gcc -I"/usr/local/lib/R/include" -DNDEBUG -I"/usr/local/lib/R/site-library/Rcpp/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/unsupported" -I"/usr/local/lib/R/site-library/BH/include" -I"/usr/local/lib/R/site-library/StanHeaders/include/src/" -I"/usr/local/lib/R/site-library/StanHeaders/include/" -I"/usr/local/lib/R/site-library/RcppParallel/include/" -I"/usr/local/lib/R/site-library/rstan/include" -DEIGEN_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DBOOST_PENDING_INTEGER_LOG2_HPP -DSTAN_THREADS -DBOOST_NO_AUTO_PTR -include '/usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp' -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1 -I/usr/local/include -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c foo.c -o foo.o
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:88,
## from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:1: error: unknown type name ‘namespace’
## 628 | namespace Eigen {
## | ^~~~~~~~~
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:17: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’ before ‘{’ token
## 628 | namespace Eigen {
## | ^
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:96:10: fatal error: complex: No such file or directory
## 96 | #include <complex>
## | ^~~~~~~~~
## compilation terminated.
## make: *** [/usr/local/lib/R/etc/Makeconf:168: foo.o] Error 1
params_3 <- c('beta0', 'beta1', 'sigma')
traceplot(bayesion_model_3.1, pars = params_3, inc_warmup = TRUE)lm_vs_br_coeficients(lineal_model_1, bayesion_model_3.1, params_3)vars_3 <- c('flipper_length_mm')
lm_vs_br_models_validation(lineal_model_1, bayesion_model_3.1, params_3, vars_3, test_set)bayesion_predictor_3.1 <- BayesianRegressionPredictor.from(bayesion_model_3.1, params_3, vars_3)
plot_compare_fit(
lineal_model_1,
bayesion_predictor_3.1,
train_set_with_outliers,
label_1='Regresion Lineal SIN outliers',
label_2='Regresion Bayesiana CON outliers'
)En este aso entrenamos solo con el 10% de lo datos.
train_test <- train_test_split(dataset, train_size = 0.05, shuffle = TRUE)## [1] "Train set size: 16"
## [1] "Test set size: 317"
train_set_4 <- train_test[[1]]
test_set_4 <- train_test[[2]]plot_data(train_set_4)lineal_model_4 <- lm(
body_mass_g ~ flipper_length_mm,
data = train_set_4
)bayesion_model_4 <- stan(
model_code = "
data {
int<lower=1> obs_count;
vector<lower=1>[obs_count] x;
vector[obs_count] y;
}
parameters {
real beta0;
real beta1;
real<lower=0> sigma;
}
model {
// Distribuciones a priori
beta0 ~ normal(0, 8000); // Intercept
beta1 ~ normal(0, 100);
sigma ~ exponential(0.1);
// Likelihood
y ~ normal(beta0 + beta1 * x, sigma);
}
",
data = list(
obs_count = nrow(train_set_4),
x = colvalues(train_set_4, 'flipper_length_mm'),
y = colvalues(train_set_4, 'body_mass_g')
),
chains = 3,
iter = 300,
warmup = 180,
thin = 1
)## Running /usr/local/lib/R/bin/R CMD SHLIB foo.c
## gcc -I"/usr/local/lib/R/include" -DNDEBUG -I"/usr/local/lib/R/site-library/Rcpp/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/unsupported" -I"/usr/local/lib/R/site-library/BH/include" -I"/usr/local/lib/R/site-library/StanHeaders/include/src/" -I"/usr/local/lib/R/site-library/StanHeaders/include/" -I"/usr/local/lib/R/site-library/RcppParallel/include/" -I"/usr/local/lib/R/site-library/rstan/include" -DEIGEN_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DBOOST_PENDING_INTEGER_LOG2_HPP -DSTAN_THREADS -DBOOST_NO_AUTO_PTR -include '/usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp' -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1 -I/usr/local/include -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c foo.c -o foo.o
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:88,
## from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:1: error: unknown type name ‘namespace’
## 628 | namespace Eigen {
## | ^~~~~~~~~
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:17: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’ before ‘{’ token
## 628 | namespace Eigen {
## | ^
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:96:10: fatal error: complex: No such file or directory
## 96 | #include <complex>
## | ^~~~~~~~~
## compilation terminated.
## make: *** [/usr/local/lib/R/etc/Makeconf:168: foo.o] Error 1
params_4 <- c('beta0', 'beta1', 'sigma')
traceplot(bayesion_model_4, pars = params_4, inc_warmup = TRUE)Coeficientes de la regresión múltiple:
lineal_model_4$coefficients## (Intercept) flipper_length_mm
## -5760.9598 50.3946
Coeficientes descubiertos por la regresión múltiple bayesiana:
for(param in params_4) print(get_posterior_mean(bayesion_model_4, par=param)[4])## [1] -5664.614
## [1] 49.90622
## [1] 257.5905
vars_4 <- c('flipper_length_mm')
lm_vs_br_models_validation(lineal_model_4, bayesion_model_4, params_4, vars_4, test_set)bayesion_predictor_4 <- BayesianRegressionPredictor.from(bayesion_model_4, params_4, vars_4)
plot_compare_fit(
lineal_model_4,
bayesion_predictor_4,
train_set,
label_1='Regresion Lineal',
label_2='Regresion Bayesiana'
)Definimos una distribución uniforme para el beta asociado a la variable flipper_length_mm.
## Running /usr/local/lib/R/bin/R CMD SHLIB foo.c
## gcc -I"/usr/local/lib/R/include" -DNDEBUG -I"/usr/local/lib/R/site-library/Rcpp/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/" -I"/usr/local/lib/R/site-library/RcppEigen/include/unsupported" -I"/usr/local/lib/R/site-library/BH/include" -I"/usr/local/lib/R/site-library/StanHeaders/include/src/" -I"/usr/local/lib/R/site-library/StanHeaders/include/" -I"/usr/local/lib/R/site-library/RcppParallel/include/" -I"/usr/local/lib/R/site-library/rstan/include" -DEIGEN_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DBOOST_PENDING_INTEGER_LOG2_HPP -DSTAN_THREADS -DBOOST_NO_AUTO_PTR -include '/usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp' -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1 -I/usr/local/include -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c foo.c -o foo.o
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:88,
## from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:1: error: unknown type name ‘namespace’
## 628 | namespace Eigen {
## | ^~~~~~~~~
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/src/Core/util/Macros.h:628:17: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’ before ‘{’ token
## 628 | namespace Eigen {
## | ^
## In file included from /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Dense:1,
## from /usr/local/lib/R/site-library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13,
## from <command-line>:
## /usr/local/lib/R/site-library/RcppEigen/include/Eigen/Core:96:10: fatal error: complex: No such file or directory
## 96 | #include <complex>
## | ^~~~~~~~~
## compilation terminated.
## make: *** [/usr/local/lib/R/etc/Makeconf:168: foo.o] Error 1
br_vs_br_coeficients(bayesion_model_1, bayesion_model_5, params_5)lm_vs_br_models_validation(lineal_model_1, bayesion_model_1, params_1, vars_1, test_set)vars_5 <- c('flipper_length_mm')
lm_vs_br_models_validation(lineal_model_1, bayesion_model_5, params_5, vars_5, test_set)bayesion_predictor_5 <- BayesianRegressionPredictor.from(bayesion_model_5, params_5, vars_5)
plot_compare_fit(
bayesion_predictor_1,
bayesion_predictor_5,
train_set,
label_1='Regresion Bayesiana con dist informativa',
label_2='Regresion Bayesiana con dist menos informativa'
)COMPLETAR